In [50]:
print("Hello")
Hello

Step 1: Load the dataset into Python

In [51]:
import pandas as pd

# Load your dataset
df = pd.read_csv("C://Users//sangr//Downloads//new_credit_risk_dataset.csv")

# Quick look at the data
print(df.head())
print(df.info())
print(df.isnull().sum())  # check missing values
   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT             1230.0   
1          21           9600                   OWN               50.0   
2          25           9600              MORTGAGE               10.0   
3          23          65500                  RENT               40.0   
4          24          54400                  RENT               80.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  
1                 0.10                         N                           2  
2                 0.57                         N                           3  
3                 0.53                         N                           2  
4                 0.55                         Y                           4  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB
None
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Define Features & Target

In [52]:
# Define target and features
X = df.drop(columns=["loan_status"])  # features
y = df["loan_status"]  # target

Train-Test Split

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Evaluate Model

In [54]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming X_train, X_test, y_train, y_test are already defined

# First, identify which columns contain categorical data
# For this example, let's assume we need to find the column index for 'MORTGAGE'
# You should replace this with code that identifies all categorical columns in your dataset
import pandas as pd

# Assuming X_train is a DataFrame
# Find categorical columns (columns with object or category dtype)
categorical_columns = []
for i, dtype in enumerate(X_train.dtypes):
    if dtype == 'object' or dtype.name == 'category':
        categorical_columns.append(i)

# If X_train is not a DataFrame but a numpy array, you'll need to specify the indices manually
# For example, if you know 'MORTGAGE' is in column 3:
# categorical_columns = [3]  # Add all categorical column indices here

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'  # Keep other columns as they are
)

# Create the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
Accuracy: 0.9330980512505754

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.97      0.72      0.82      1422

    accuracy                           0.93      6517
   macro avg       0.95      0.86      0.89      6517
weighted avg       0.94      0.93      0.93      6517


Confusion Matrix:
 [[5059   36]
 [ 400 1022]]

Improvements¶

Try Tree-Based Models¶

In [57]:
# First, install the xgboost package
!pip install xgboost

# Then import the required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming X_train, X_test, y_train, y_test are already defined
# First, identify categorical columns (this is an example - adjust based on your actual data)
# Let's assume 'MORTGAGE' is in a column named 'home_ownership'
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Random Forest with preprocessing
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
rf_pred = rf_pipeline.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("\nRandom Forest Report:\n", classification_report(y_test, rf_pred))

# XGBoost with preprocessing
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

xgb_pipeline.fit(X_train, y_train)
xgb_pred = xgb_pipeline.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("\nXGBoost Report:\n", classification_report(y_test, xgb_pred))
Requirement already satisfied: xgboost in c:\user\p\lib\site-packages (3.0.5)
Requirement already satisfied: numpy in c:\user\p\lib\site-packages (from xgboost) (2.1.3)
Requirement already satisfied: scipy in c:\user\p\lib\site-packages (from xgboost) (1.15.3)
Random Forest Accuracy: 0.9340187202700629

Random Forest Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.97      0.72      0.83      1422

    accuracy                           0.93      6517
   macro avg       0.95      0.86      0.89      6517
weighted avg       0.94      0.93      0.93      6517

XGBoost Accuracy: 0.9350928341261316

XGBoost Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.96      0.73      0.83      1422

    accuracy                           0.94      6517
   macro avg       0.94      0.86      0.90      6517
weighted avg       0.94      0.94      0.93      6517

C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:17:03] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
In [105]:
#Handle Class Imbalance
In [ ]:
 
In [59]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer  # Import SimpleImputer for handling missing values

# Assuming X_train and X_test are pandas DataFrames
# First, identify categorical columns (those with string values like 'MORTGAGE')
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Create a preprocessor that will handle both numerical and categorical features
# Add imputation steps to handle missing values
preprocessor = ColumnTransformer(
    transformers=[
        # For numerical columns: first impute missing values with the mean
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('passthrough', 'passthrough')
        ]), numerical_columns),
        # For categorical columns: first impute missing values with the most frequent value, then one-hot encode
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ])

# Create a pipeline that first preprocesses the data, then applies the model
model_balanced = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight="balanced"))
])

# Fit the pipeline to the training data
model_balanced.fit(X_train, y_train)

# Make predictions
y_pred_bal = model_balanced.predict(X_test)

# Evaluate the model
print("Balanced Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_bal))
print("\nReport:\n", classification_report(y_test, y_pred_bal))
Balanced Logistic Regression Accuracy: 0.7759705385913764

Report:
               precision    recall  f1-score   support

           0       0.92      0.78      0.85      5095
           1       0.49      0.76      0.60      1422

    accuracy                           0.78      6517
   macro avg       0.71      0.77      0.72      6517
weighted avg       0.83      0.78      0.79      6517

C:\User\p\Lib\site-packages\sklearn\linear_model\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Feature Importance

Random Forest / XGBoost feature importances:

In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Identify categorical & numeric columns
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# Fit model
rf_pipeline.fit(X_train, y_train)

# Get feature importances
importances = rf_pipeline.named_steps['model'].feature_importances_
feature_names = rf_pipeline.named_steps['preprocessor'].get_feature_names_out()
feat_importances = pd.Series(importances, index=feature_names)
print(feat_importances.sort_values(ascending=False).head(10))
num__loan_percent_income               0.227442
num__person_income                     0.139640
num__loan_int_rate                     0.108983
num__loan_amnt                         0.069097
num__person_emp_length                 0.062406
cat__loan_grade_D                      0.059890
cat__person_home_ownership_RENT        0.053760
num__person_age                        0.043827
num__cb_person_cred_hist_length        0.034204
cat__person_home_ownership_MORTGAGE    0.027569
dtype: float64
In [62]:
#Tune Hyperparameters (GridSearchCV / RandomizedSearchCV)
In [63]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # your earlier ColumnTransformer
    ('model', XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    ))
])

param_grid = {
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__n_estimators': [100, 200, 500],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

grid = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='recall',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)
print("Best Recall Score:", grid.best_score_)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:19:10] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 500, 'model__subsample': 1.0}
Best Recall Score: 0.7525504789048091
In [ ]:
 

Evaluate on the Test Set¶

Now that you have tuned hyperparameters, retrain the model on the full training data and evaluate on the test data:

In [64]:
best_xgb = grid.best_estimator_

y_pred = best_xgb.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
Test Accuracy: 0.9361669479822003

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.94      0.75      0.84      1422

    accuracy                           0.94      6517
   macro avg       0.94      0.87      0.90      6517
weighted avg       0.94      0.94      0.93      6517


Confusion Matrix:
 [[5029   66]
 [ 350 1072]]
In [65]:
##Feature Importance (XGBoost specific)
In [66]:
importances = best_xgb.named_steps['model'].feature_importances_
feature_names = best_xgb.named_steps['preprocessor'].get_feature_names_out()

feat_imp = pd.Series(importances, index=feature_names)
print(feat_imp.sort_values(ascending=False).head(15))
cat__loan_grade_D                     0.137872
cat__person_home_ownership_RENT       0.110663
cat__person_home_ownership_OWN        0.097535
cat__loan_grade_E                     0.066564
cat__loan_grade_C                     0.061964
cat__loan_grade_G                     0.053514
cat__loan_grade_A                     0.052265
cat__loan_intent_DEBTCONSOLIDATION    0.048179
num__loan_percent_income              0.046322
cat__loan_grade_F                     0.042818
cat__loan_intent_MEDICAL              0.038055
cat__loan_intent_HOMEIMPROVEMENT      0.033725
cat__loan_intent_VENTURE              0.033176
cat__loan_grade_B                     0.024915
num__person_income                    0.020623
dtype: float32
In [ ]:
 
In [67]:
import pandas as pd

# New applicant details
new_applicant = pd.DataFrame([{
    "person_age": 30,
    "person_income": 45000,
    "person_home_ownership": "RENT",
    "person_emp_length": 24,  # months
    "loan_intent": "PERSONAL",
    "loan_grade": "C",
    "loan_amnt": 8000,
    "loan_int_rate": 12.5,
    "loan_percent_income": 0.18,
    "cb_person_default_on_file": "N",
    "cb_person_cred_hist_length": 5
}])
In [68]:
#Use your trained pipeline to predict
In [70]:
xgb_pipeline.fit(X_train, y_train)
C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:31:34] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738: 
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Out[70]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')),
                                                 ('num', 'passthrough',
                                                  Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_perso...
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=None, n_jobs=None,
                               num_parallel_tree=None, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')),
                                                 ('num', 'passthrough',
                                                  Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_perso...
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=None, n_jobs=None,
                               num_parallel_tree=None, ...))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')),
                                ('num', 'passthrough',
                                 Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object'))])
Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')
OneHotEncoder(handle_unknown='ignore')
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')
passthrough
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)
In [71]:
# Predict class (0 = good loan, 1 = default)
prediction = xgb_pipeline.predict(new_applicant)

# Predict probability
probability = xgb_pipeline.predict_proba(new_applicant)

print("Prediction:", prediction[0])
print("Default Probability:", probability[0][1])
Prediction: 0
Default Probability: 0.043883912

Trying multiple applicant¶

In [72]:
new_applicants = pd.DataFrame([
    {
        "person_age": 30,
        "person_income": 55000,
        "person_home_ownership": "RENT",
        "person_emp_length": 24,
        "loan_intent": "EDUCATION",
        "loan_grade": "B",
        "loan_amnt": 10000,
        "loan_int_rate": 12.5,
        "loan_percent_income": 0.18,
        "cb_person_default_on_file": "N",
        "cb_person_cred_hist_length": 5
    },
    {
        "person_age": 45,
        "person_income": 30000,
        "person_home_ownership": "MORTGAGE",
        "person_emp_length": 60,
        "loan_intent": "MEDICAL",
        "loan_grade": "D",
        "loan_amnt": 25000,
        "loan_int_rate": 20.5,
        "loan_percent_income": 0.5,
        "cb_person_default_on_file": "Y",
        "cb_person_cred_hist_length": 10
    }
])

preds = xgb_pipeline.predict(new_applicants)
probs = xgb_pipeline.predict_proba(new_applicants)

results = pd.DataFrame({
    "Prediction": preds,
    "Default_Probability": probs[:, 1]
})
print(results)
   Prediction  Default_Probability
0           0             0.043685
1           1             0.960115
In [73]:
import joblib
joblib.dump(xgb_pipeline, "xgb_credit_model.pkl")
Out[73]:
['xgb_credit_model.pkl']
In [75]:
import pandas as pd

# Example: multiple new applicants
new_applicants = pd.DataFrame([
    {
        "person_age": 30,
        "person_income": 55000,
        "person_home_ownership": "RENT",
        "person_emp_length": 24,
        "loan_intent": "EDUCATION",
        "loan_grade": "B",
        "loan_amnt": 10000,
        "loan_int_rate": 12.5,
        "loan_percent_income": 0.18,
        "cb_person_default_on_file": "N",
        "cb_person_cred_hist_length": 5
    },
    {
        "person_age": 45,
        "person_income": 30000,
        "person_home_ownership": "MORTGAGE",
        "person_emp_length": 60,
        "loan_intent": "MEDICAL",
        "loan_grade": "D",
        "loan_amnt": 25000,
        "loan_int_rate": 20.5,
        "loan_percent_income": 0.5,
        "cb_person_default_on_file": "Y",
        "cb_person_cred_hist_length": 10
    },
    {
        "person_age": 22,
        "person_income": 45000,
        "person_home_ownership": "OWN",
        "person_emp_length": 12,
        "loan_intent": "PERSONAL",
        "loan_grade": "A",
        "loan_amnt": 5000,
        "loan_int_rate": 10.5,
        "loan_percent_income": 0.11,
        "cb_person_default_on_file": "N",
        "cb_person_cred_hist_length": 3
    }
])

# Get predictions (0 = good loan, 1 = default)
preds = xgb_pipeline.predict(new_applicants)

# Get probabilities
probs = xgb_pipeline.predict_proba(new_applicants)

# Combine results into a DataFrame
results = new_applicants.copy()
results["Prediction"] = preds
results["Default_Probability"] = probs[:, 1]

print(results)
   person_age  person_income person_home_ownership  person_emp_length  \
0          30          55000                  RENT                 24   
1          45          30000              MORTGAGE                 60   
2          22          45000                   OWN                 12   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0   EDUCATION          B      10000           12.5                 0.18   
1     MEDICAL          D      25000           20.5                 0.50   
2    PERSONAL          A       5000           10.5                 0.11   

  cb_person_default_on_file  cb_person_cred_hist_length  Prediction  \
0                         N                           5           0   
1                         Y                          10           1   
2                         N                           3           0   

   Default_Probability  
0             0.043685  
1             0.960115  
2             0.000190  
In [76]:
import matplotlib.pyplot as plt
import numpy as np

# Plot default probabilities
plt.figure(figsize=(8,5))
applicants = np.arange(len(results))  # index for applicants
plt.bar(applicants, results["Default_Probability"], color="orange")

# Add labels
plt.xticks(applicants, [f"Applicant {i+1}" for i in applicants])
plt.ylabel("Default Probability")
plt.title("Predicted Default Risk for Applicants")

# Add values on top of bars
for i, prob in enumerate(results["Default_Probability"]):
    plt.text(i, prob + 0.01, f"{prob:.2f}", ha="center", fontsize=10)

plt.ylim(0,1)  # probability scale (0 to 1)
plt.show()
No description has been provided for this image
In [77]:
import joblib

# Save the trained pipeline
joblib.dump(xgb_pipeline, "credit_risk_model.pkl")

print("Model saved as credit_risk_model.pkl")
Model saved as credit_risk_model.pkl
In [79]:
def predict_applicant(applicant_data):
    pred = xgb_pipeline.predict(applicant_data)[0]
    prob = xgb_pipeline.predict_proba(applicant_data)[0][1]
    
    result = "Default" if pred == 1 else "Good Loan"
    return {"Prediction": result, "Default Probability": round(prob, 4)}
In [80]:
# Example applicant (must match feature order/columns of your dataset)
new_applicant = df.drop("loan_status", axis=1).iloc[[0]]  

print(predict_applicant(new_applicant))
{'Prediction': 'Default', 'Default Probability': np.float32(0.9983)}
In [81]:
#Batch Prediction Function
In [82]:
import pandas as pd

def predict_batch(applicants_df):
    """
    Takes a DataFrame of applicants (same structure as training data, without loan_status).
    Returns predictions + default probabilities.
    """
    preds = xgb_pipeline.predict(applicants_df)
    probs = xgb_pipeline.predict_proba(applicants_df)[:, 1]  # probability of default
    
    results = applicants_df.copy()
    results["Prediction"] = preds
    results["Default_Probability"] = probs.round(4)
    results["Prediction_Label"] = results["Prediction"].map({0: "Good Loan", 1: "Default"})
    
    return results
In [83]:
# Example batch of applicants
new_applicants = pd.DataFrame([
    {
        "person_age": 30,
        "person_income": 55000,
        "person_home_ownership": "RENT",
        "person_emp_length": 24,
        "loan_intent": "EDUCATION",
        "loan_grade": "B",
        "loan_amnt": 10000,
        "loan_int_rate": 12.5,
        "loan_percent_income": 0.18,
        "cb_person_default_on_file": "N",
        "cb_person_cred_hist_length": 5
    },
    {
        "person_age": 45,
        "person_income": 30000,
        "person_home_ownership": "MORTGAGE",
        "person_emp_length": 60,
        "loan_intent": "MEDICAL",
        "loan_grade": "D",
        "loan_amnt": 25000,
        "loan_int_rate": 20.5,
        "loan_percent_income": 0.50,
        "cb_person_default_on_file": "Y",
        "cb_person_cred_hist_length": 10
    }
])

results = predict_batch(new_applicants)
print(results)
   person_age  person_income person_home_ownership  person_emp_length  \
0          30          55000                  RENT                 24   
1          45          30000              MORTGAGE                 60   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0   EDUCATION          B      10000           12.5                 0.18   
1     MEDICAL          D      25000           20.5                 0.50   

  cb_person_default_on_file  cb_person_cred_hist_length  Prediction  \
0                         N                           5           0   
1                         Y                          10           1   

   Default_Probability Prediction_Label  
0               0.0437        Good Loan  
1               0.9601          Default  
In [84]:
#Generate synthetic applicants
In [85]:
import numpy as np
import pandas as pd

np.random.seed(42)  # reproducibility

# Define possible categories from your dataset
home_ownership = ["RENT", "MORTGAGE", "OWN", "OTHER"]
loan_intents = ["EDUCATION", "MEDICAL", "VENTURE", "PERSONAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"]
loan_grades = ["A", "B", "C", "D", "E", "F", "G"]
default_file = ["Y", "N"]

# Generate 5000 applicants
new_5000 = pd.DataFrame({
    "person_age": np.random.randint(18, 70, 5000),
    "person_income": np.random.randint(20000, 120000, 5000),
    "person_home_ownership": np.random.choice(home_ownership, 5000),
    "person_emp_length": np.random.randint(0, 240, 5000),  # months
    "loan_intent": np.random.choice(loan_intents, 5000),
    "loan_grade": np.random.choice(loan_grades, 5000),
    "loan_amnt": np.random.randint(1000, 40000, 5000),
    "loan_int_rate": np.round(np.random.uniform(5, 30, 5000), 2),  # interest rates
    "loan_percent_income": np.round(np.random.uniform(0.05, 0.6, 5000), 2),
    "cb_person_default_on_file": np.random.choice(default_file, 5000),
    "cb_person_cred_hist_length": np.random.randint(1, 30, 5000)
})

print("✅ Synthetic dataset of 5000 applicants generated")
new_5000.head()
✅ Synthetic dataset of 5000 applicants generated
Out[85]:
person_age person_income person_home_ownership person_emp_length loan_intent loan_grade loan_amnt loan_int_rate loan_percent_income cb_person_default_on_file cb_person_cred_hist_length
0 56 21920 OTHER 113 PERSONAL F 29194 27.20 0.37 N 13
1 69 97219 OTHER 132 HOMEIMPROVEMENT D 34785 24.23 0.55 Y 29
2 46 96872 MORTGAGE 230 MEDICAL C 38458 11.91 0.38 N 27
3 32 101132 OWN 74 EDUCATION F 9373 11.04 0.18 N 7
4 60 22093 RENT 58 EDUCATION E 31283 25.96 0.23 N 21
In [86]:
results_5000 = predict_batch(new_5000)

print(results_5000.head())
print("✅ Total applicants scored:", len(results_5000))
   person_age  person_income person_home_ownership  person_emp_length  \
0          56          21920                 OTHER                113   
1          69          97219                 OTHER                132   
2          46          96872              MORTGAGE                230   
3          32         101132                   OWN                 74   
4          60          22093                  RENT                 58   

       loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0         PERSONAL          F      29194          27.20                 0.37   
1  HOMEIMPROVEMENT          D      34785          24.23                 0.55   
2          MEDICAL          C      38458          11.91                 0.38   
3        EDUCATION          F       9373          11.04                 0.18   
4        EDUCATION          E      31283          25.96                 0.23   

  cb_person_default_on_file  cb_person_cred_hist_length  Prediction  \
0                         N                          13           1   
1                         Y                          29           1   
2                         N                          27           0   
3                         N                           7           0   
4                         N                          21           1   

   Default_Probability Prediction_Label  
0               0.9850          Default  
1               0.9144          Default  
2               0.2375        Good Loan  
3               0.0004        Good Loan  
4               0.9815          Default  
✅ Total applicants scored: 5000
In [87]:
summary = results_5000["Prediction_Label"].value_counts(normalize=True) * 100
print("📊 Portfolio Risk Distribution (%):\n", summary)
📊 Portfolio Risk Distribution (%):
 Prediction_Label
Default      55.64
Good Loan    44.36
Name: proportion, dtype: float64
In [88]:
results_5000.to_csv("synthetic_credit_risk_5000.csv", index=False)
print("✅ Results saved to synthetic_credit_risk_5000.csv")
✅ Results saved to synthetic_credit_risk_5000.csv
In [ ]:
 
In [89]:
#Risk Distribution (Good vs Default)
In [90]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(6,4))
sns.countplot(x="Prediction_Label", data=results_5000, palette="Set2")
plt.title("Portfolio Risk Distribution (5000 Applicants)")
plt.xlabel("Loan Outcome")
plt.ylabel("Number of Applicants")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\1746783086.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x="Prediction_Label", data=results_5000, palette="Set2")
No description has been provided for this image
In [91]:
#Default Probability Histogram
In [92]:
plt.figure(figsize=(8,5))
sns.histplot(results_5000["Default_Probability"], bins=20, kde=True, color="orange")
plt.title("Distribution of Default Probabilities")
plt.xlabel("Default Probability")
plt.ylabel("Number of Applicants")
plt.show()
No description has been provided for this image
In [93]:
#Age vs Default Risk
In [94]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="person_age", data=results_5000, palette="coolwarm")
plt.title("Age Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Age")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\832474758.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="Prediction_Label", y="person_age", data=results_5000, palette="coolwarm")
No description has been provided for this image
In [95]:
#Income vs Default Risk
In [96]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="person_income", data=results_5000, palette="viridis")
plt.title("Income Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Income")
plt.ylim(0, 150000)
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\86574630.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="Prediction_Label", y="person_income", data=results_5000, palette="viridis")
No description has been provided for this image
In [97]:
#oan Amount vs Default Risk
In [98]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="loan_amnt", data=results_5000, palette="mako")
plt.title("Loan Amount Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Loan Amount")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\277379129.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="Prediction_Label", y="loan_amnt", data=results_5000, palette="mako")
No description has been provided for this image
In [99]:
pip install plotly
Requirement already satisfied: plotly in c:\user\p\lib\site-packages (5.24.1)
Requirement already satisfied: tenacity>=6.2.0 in c:\user\p\lib\site-packages (from plotly) (9.0.0)
Requirement already satisfied: packaging in c:\user\p\lib\site-packages (from plotly) (24.2)
Note: you may need to restart the kernel to use updated packages.
In [100]:
import plotly.express as px

fig = px.histogram(
    results_5000, 
    x="Prediction_Label", 
    color="Prediction_Label", 
    title="Portfolio Risk Distribution (5000 Applicants)",
    text_auto=True
)
fig.show()
In [101]:
fig = px.histogram(
    results_5000, 
    x="Default_Probability", 
    nbins=30, 
    title="Distribution of Default Probabilities",
    color="Prediction_Label"
)
fig.show()
In [102]:
fig = px.bar(
    results_5000.groupby("loan_intent")["Prediction"].mean().reset_index(),
    x="loan_intent",
    y="Prediction",
    title="Default Rate by Loan Intent",
    color="loan_intent"
)
fig.show()
In [103]:
fig = px.scatter(
    results_5000, 
    x="person_income", 
    y="Default_Probability", 
    color="Prediction_Label",
    size="loan_amnt",
    hover_data=["loan_intent", "loan_grade"],
    title="Income vs Default Probability (Loan Size = Bubble)"
)
fig.show()
In [104]:
fig = px.box(
    results_5000, 
    x="Prediction_Label", 
    y="person_age", 
    color="Prediction_Label",
    title="Age Distribution by Loan Outcome"
)
fig.show()
In [ ]: